{
 "cells": [
  {
   "cell_type": "code",
   "id": "initial_id",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "# dataframe的创建方式\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "# 通过series来创建\n",
    "s1 = pd.Series([1,2,3,4,5])\n",
    "s2 = pd.Series([6,7,8,9,10])\n",
    "df = pd.DataFrame({\"第1列\":s1,\"第2列\":s2})\n",
    "# 通过字典来创建\n",
    "df = pd.DataFrame(\n",
    "    {\n",
    "        \"name\":[\"tom\",'jack','alice','bob','allen'],\n",
    "        \"age\":[15,17,20,26,30],\n",
    "        \"score\":[60.5,80,30.6,70,83.5]\n",
    "    },index=[1,2,3,4,5],columns=[\"name\",\"score\",\"age\"]\n",
    ")\n",
    "df"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# dataframe的属性\n",
    "print('行索引：')\n",
    "print(df.index)\n",
    "print('列标签：')\n",
    "print(df.columns)\n",
    "print('值')\n",
    "print(df.values)"
   ],
   "id": "a1f18b6e5a11e09b",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "print('维度：',df.ndim)\n",
    "print('形状:', df.shape)\n",
    "print('元素个数：', df.size)\n",
    "print('数据类型：')\n",
    "print(df.dtypes)"
   ],
   "id": "20870b9745d318c9",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# 行列转置\n",
    "print(df.T)"
   ],
   "id": "bc1dc91f326be71e",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# 获取元素 loc  iloc  at  iat\n",
    "# 某行\n",
    "print(df.loc[4])\n",
    "print(df.iloc[3])"
   ],
   "id": "1481623e71cb1958",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# 某列\n",
    "print(df.loc[:,'name'])\n",
    "print(df.iloc[:,0])"
   ],
   "id": "ed9745f7e0f57ad9",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# 单个元素\n",
    "print(df.at[3,'score'])\n",
    "print(df.iat[2,1])\n",
    "print(df.loc[3,'score'])\n",
    "print(df.iloc[2,1])"
   ],
   "id": "a9d2076538ffce0f",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# 获取单列数据\n",
    "print(df['name'])\n",
    "print(type(df['name']))\n",
    "print(df.name)\n",
    "print(type(df.name))\n",
    "print(df[['name']])\n",
    "print(type(df[['name']]))\n",
    "df[['name']]"
   ],
   "id": "5f1167aa3ee04168",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": "print(df[['name','score']]) # 多列数据的获取",
   "id": "a8e01c435646d97c",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# 查看部分数据\n",
    "print(df.head(2))"
   ],
   "id": "9c1a94913398664b",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": "print(df.tail(3))",
   "id": "9e340a4305967208",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# 使用布尔索引筛选数据\n",
    "df[df.score>70]\n",
    "df[ (df['score']>70) & (df.age<20)]"
   ],
   "id": "317801278afa0406",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# 随机抽样\n",
    "df.sample(3)"
   ],
   "id": "e60085a9ffadff71",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "df = pd.DataFrame(\n",
    "    {\n",
    "        \"name\":[\"tom\",'jack','alice','bob','allen'],\n",
    "        \"age\":[15,17,20,26,30],\n",
    "        \"score\":[60.5,80,30.6,70,83.5]\n",
    "    },index=[1,2,3,4,5],columns=[\"name\",\"score\",\"age\"]\n",
    ")"
   ],
   "id": "c3daabb064001eed",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "print(df.head()) #查看前n行数据，默认是5行\n",
    "print(df.tail(1))  #查看后n行数据，默认是5行"
   ],
   "id": "d47019bb0a81bd32",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": "print(df.isin(['jack',20]))  #查看元素是否包含在参数集合中",
   "id": "13cbf573c29edaf8",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": "print(df.isna()) # 查看元素是否是缺失值",
   "id": "e6684f16b6e6ca3f",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "print(df['score'].sum())  #某一列的总和\n",
    "print(df.score.max())  #最大值\n",
    "print(df.age.min())  #最小值\n",
    "print(df.score.mean())  #平均数\n",
    "print(df.score.median())  #中位数\n",
    "print(df.age.mode())  #众数"
   ],
   "id": "276567dab4a76147",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-08-10T13:31:56.355975Z",
     "start_time": "2025-08-10T13:31:56.344100Z"
    }
   },
   "cell_type": "code",
   "source": [
    "import pandas as pd\n",
    "df = pd.DataFrame(\n",
    "    {\n",
    "        \"name\":[\"tom\",\"tom\",'jack','alice','bob','allen'],\n",
    "        \"age\":[15,15,15,20,26,30],\n",
    "        \"score\":[60.5,60.5,80,30.6,70,83.5]\n",
    "    },index=[1,2,3,4,5,6],columns=[\"name\",\"score\",\"age\"]\n",
    ")\n",
    "print(df)"
   ],
   "id": "d6b44c146a0012f9",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "    name  score  age\n",
      "1    tom   60.5   15\n",
      "2    tom   60.5   15\n",
      "3   jack   80.0   15\n",
      "4  alice   30.6   20\n",
      "5    bob   70.0   26\n",
      "6  allen   83.5   30\n"
     ]
    }
   ],
   "execution_count": 6
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "print(df.score.std())  #标准差\n",
    "print(df.score.var()) #方差\n",
    "print(df.score.quantile(0.25))  #分位数"
   ],
   "id": "6f62acec0cac179c",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": "print(df.describe())",
   "id": "88be616844f2911a",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": "print(df.count())  #每一列非缺失值的个数",
   "id": "1b535499829627da",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-08-10T13:31:36.355314Z",
     "start_time": "2025-08-10T13:31:36.338218Z"
    }
   },
   "cell_type": "code",
   "source": "print(df.value_counts()) #出现的次数",
   "id": "dc43f1390cb84d0c",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "name   score  age\n",
      "tom    60.5   15     2\n",
      "alice  30.6   20     1\n",
      "allen  83.5   30     1\n",
      "bob    70.0   26     1\n",
      "jack   80.0   15     1\n",
      "Name: count, dtype: int64\n"
     ]
    }
   ],
   "execution_count": 4
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-08-10T13:31:43.312708Z",
     "start_time": "2025-08-10T13:31:43.303607Z"
    }
   },
   "cell_type": "code",
   "source": "print(df.drop_duplicates())",
   "id": "915509acf6f8f90d",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "    name  score  age\n",
      "1    tom   60.5   15\n",
      "3   jack   80.0   15\n",
      "4  alice   30.6   20\n",
      "5    bob   70.0   26\n",
      "6  allen   83.5   30\n"
     ]
    }
   ],
   "execution_count": 5
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": "print(df.duplicated(subset=['age']))  #查看是否重复",
   "id": "97d678fc6de6fedd",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": "df.sample(2) #随机抽样",
   "id": "ccfcaaf99ea04747",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": "print(df.replace(15,30))",
   "id": "5c417727c859d945",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "df.cumsum()\n",
    "df.cummin(axis=0)"
   ],
   "id": "ad282748e2c0bee2",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": "print(df.sort_index(ascending=False))",
   "id": "d4f05faac6cf6731",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": "print(df.sort_values(by='score'))",
   "id": "d33934554548ee04",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "df = pd.DataFrame(\n",
    "    {\n",
    "        \"name\":[\"tom\",\"tom\",'jack','alice','bob','allen'],\n",
    "        \"age\":[15,15,15,20,26,30],\n",
    "        \"score\":[60.5,60.5,80,30.6,70,80]\n",
    "    },index=[1,2,3,4,5,6],columns=[\"name\",\"score\",\"age\"]\n",
    ")"
   ],
   "id": "e85bb3b0941ced0c",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": "print(df.sort_values(by=['score','age'],ascending=[True,False]))",
   "id": "f21d3b86390a0576",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "df.nlargest(2,columns=['score','age'])\n",
    "df.nsmallest(2,columns=['score','age'])"
   ],
   "id": "521cf10132bc12c",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "'''\n",
    "案例1：学生成绩分析\n",
    "场景：某班级的学生成绩数据如下，请完成以下任务：\n",
    "1. 计算每位学生的总分和平均分。\n",
    "2. 找出数学成绩高于90分或英语成绩高于85分的学生。\n",
    "3. 按总分从高到低排序，并输出前3名学生。\n",
    "'''\n",
    "import pandas as pd\n",
    "data = {\n",
    "    '姓名': ['张三', '李四', '王五', '赵六', '钱七'],\n",
    "    '数学': [85, 92, 78, 88, 95],\n",
    "    '英语': [90, 88, 85, 92, 80],\n",
    "    '物理': [75, 80, 88, 85, 90]\n",
    "}\n",
    "scores = pd.DataFrame(data)\n",
    "\n",
    "#1. 计算每位学生的总分和平均分。\n",
    "scores['总分'] = scores[['数学','英语','物理']].sum(axis=1)\n",
    "scores['平均分'] = scores['总分'] / 3\n",
    "scores['平均分2'] = scores[['数学','英语','物理']].mean(axis=1)\n",
    "#2. 找出数学成绩高于90分或英语成绩高于85分的学生。\n",
    "scores[ (scores['数学']>90 ) | (scores['英语']>85 )  ]\n",
    "#3. 按总分从高到低排序，并输出前3名学生。\n",
    "r1 = scores.sort_values('总分',ascending=False).head(3)\n",
    "r2 =scores.nlargest(3,columns=['总分'])\n",
    "print(r1)\n",
    "print(r2)"
   ],
   "id": "d3f1d8f7c4b989e1",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "'''\n",
    "案例2：销售数据分析\n",
    "场景：某公司销售数据如下，请完成以下任务：\n",
    "1. 计算每种产品的总销售额（销售额 = 单价 × 销量）。\n",
    "2. 找出销售额最高的产品。\n",
    "3. 按销售额从高到低排序，并输出所有产品信息。\n",
    "'''\n",
    "import pandas as pd\n",
    "\n",
    "data = {\n",
    "    '产品名称': ['A', 'B', 'C', 'D'],\n",
    "    '单价': [100, 150, 200, 120],\n",
    "    '销量': [50, 30, 20, 40]\n",
    "}\n",
    "df = pd.DataFrame(data)\n",
    "df['总销售额'] = df['单价']*df['销量']\n",
    "df.nlargest(1,columns=['总销售额'])\n",
    "df.sort_values('总销售额',ascending=False)"
   ],
   "id": "6d10ab8603957148",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-28T15:05:45.773710Z",
     "start_time": "2025-06-28T15:05:45.737500Z"
    }
   },
   "cell_type": "code",
   "source": [
    "'''案例3：电商用户行为分析\n",
    "场景：某电商平台的用户行为数据如下，请完成以下任务：\n",
    "1. 计算每位用户的总消费金额（消费金额 = 商品单价 × 购买数量）\n",
    "2. 找出消费金额最高的用户，并输出其所有信息\n",
    "3. 计算所有用户的平均消费金额（保留2位小数）\n",
    "4. 统计电子产品的总购买数量\n",
    "'''\n",
    "import pandas as pd\n",
    "\n",
    "data = {\n",
    "    '用户ID': [101, 102, 103, 104, 105],\n",
    "    '用户名': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],\n",
    "    '商品类别': ['电子产品', '服饰', '电子产品', '家居', '服饰'],\n",
    "    '商品单价': [1200, 300, 800, 150, 200],\n",
    "    '购买数量': [1, 3, 2, 5, 4]\n",
    "}\n",
    "df = pd.DataFrame(data)\n",
    "df['总消费金额'] = df['商品单价']*df['购买数量']\n",
    "df.nlargest(1,columns=['总消费金额'])\n",
    "df['总消费金额'].mean()\n",
    "df[df['商品类别']=='电子产品']['购买数量'].sum()"
   ],
   "id": "e1ada166c381f91",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "np.int64(3)"
      ]
     },
     "execution_count": 163,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 163
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-29T17:53:31.000461Z",
     "start_time": "2025-06-29T17:53:30.909675Z"
    }
   },
   "cell_type": "code",
   "source": [
    "#数据变形\n",
    "import pandas as pd\n",
    "data = {\n",
    "    'ID': [1, 2],\n",
    "    'name':['张三','李四'],\n",
    "    'Math': [90, 85],\n",
    "    'English': [88, 92],\n",
    "    'Science': [95, 89]\n",
    "}\n",
    "df = pd.DataFrame(data)\n",
    "df\n",
    "df.T\n",
    "#宽表转长表\n",
    "df2= pd.melt(df, id_vars=['ID','name'], var_name='科目', value_name='分数')\n",
    "df2.sort_values(by=['name','科目'])\n",
    "#长表转宽表\n",
    "df3=pd.pivot(df2,index=['ID','name'],columns=['科目'],values='分数')\n",
    "#分列\n",
    "data = {\n",
    "    'ID': [1, 2],\n",
    "    'name':['alice smith','bob jack'],\n",
    "    'Math': [90, 85],\n",
    "    'English': [88, 92],\n",
    "    'Science': [95, 89]\n",
    "}\n",
    "df = pd.DataFrame(data)\n",
    "df[['first name','last name']] = df['name'].str.split(' ')\n",
    "# 加载数据\n",
    "df = pd.read_csv(\"data/sleep.csv\")\n",
    "\n",
    "df=df[['person_id','blood_pressure']]\n",
    "df[['high','low']]=df['blood_pressure'].str.split('/',expand=True)\n",
    "df"
   ],
   "id": "ece55929fc3c7b91",
   "outputs": [],
   "execution_count": 29
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-29T18:19:05.053632Z",
     "start_time": "2025-06-29T18:19:04.999137Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# 加载数据\n",
    "df_employees = pd.read_csv(\"data/employees.csv\")\n",
    "df_employees\n"
   ],
   "id": "2910d88913b9bd97",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "     employee_id first_name last_name     email  phone_number      job_id  \\\n",
       "0            100     Steven      King     SKING  515.123.4567     AD_PRES   \n",
       "1            101      N_ann   Kochhar  NKOCHHAR  515.123.4568       AD_VP   \n",
       "2            102        Lex   De Haan   LDEHAAN  515.123.4569       AD_VP   \n",
       "3            103  Alexander    Hunold   AHUNOLD  590.423.4567     IT_PROG   \n",
       "4            104      Bruce     Ernst    BERNST  590.423.4568     IT_PROG   \n",
       "..           ...        ...       ...       ...           ...         ...   \n",
       "102          202        Pat       Fay      PFAY  603.123.6666      MK_REP   \n",
       "103          203      Susan    Mavris   SMAVRIS  515.123.7777      HR_REP   \n",
       "104          204    Hermann      Baer     HBAER  515.123.8888      PR_REP   \n",
       "105          205    Shelley   Higgins  SHIGGINS  515.123.8080      AC_MGR   \n",
       "106          206    William     Gietz    WGIETZ  515.123.8181  AC_ACCOUNT   \n",
       "\n",
       "      salary  commission_pct  manager_id  department_id  \n",
       "0    24000.0             NaN         NaN           90.0  \n",
       "1    17000.0             NaN       100.0           90.0  \n",
       "2    17000.0             NaN       100.0           90.0  \n",
       "3     9000.0             NaN       102.0           60.0  \n",
       "4     6000.0             NaN       103.0           60.0  \n",
       "..       ...             ...         ...            ...  \n",
       "102   6000.0             NaN       201.0           20.0  \n",
       "103   6500.0             NaN       101.0           40.0  \n",
       "104  10000.0             NaN       101.0           70.0  \n",
       "105  12000.0             NaN       101.0          110.0  \n",
       "106   8300.0             NaN       205.0          110.0  \n",
       "\n",
       "[107 rows x 10 columns]"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>employee_id</th>\n",
       "      <th>first_name</th>\n",
       "      <th>last_name</th>\n",
       "      <th>email</th>\n",
       "      <th>phone_number</th>\n",
       "      <th>job_id</th>\n",
       "      <th>salary</th>\n",
       "      <th>commission_pct</th>\n",
       "      <th>manager_id</th>\n",
       "      <th>department_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>100</td>\n",
       "      <td>Steven</td>\n",
       "      <td>King</td>\n",
       "      <td>SKING</td>\n",
       "      <td>515.123.4567</td>\n",
       "      <td>AD_PRES</td>\n",
       "      <td>24000.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>90.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>101</td>\n",
       "      <td>N_ann</td>\n",
       "      <td>Kochhar</td>\n",
       "      <td>NKOCHHAR</td>\n",
       "      <td>515.123.4568</td>\n",
       "      <td>AD_VP</td>\n",
       "      <td>17000.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>100.0</td>\n",
       "      <td>90.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>102</td>\n",
       "      <td>Lex</td>\n",
       "      <td>De Haan</td>\n",
       "      <td>LDEHAAN</td>\n",
       "      <td>515.123.4569</td>\n",
       "      <td>AD_VP</td>\n",
       "      <td>17000.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>100.0</td>\n",
       "      <td>90.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>103</td>\n",
       "      <td>Alexander</td>\n",
       "      <td>Hunold</td>\n",
       "      <td>AHUNOLD</td>\n",
       "      <td>590.423.4567</td>\n",
       "      <td>IT_PROG</td>\n",
       "      <td>9000.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>102.0</td>\n",
       "      <td>60.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>104</td>\n",
       "      <td>Bruce</td>\n",
       "      <td>Ernst</td>\n",
       "      <td>BERNST</td>\n",
       "      <td>590.423.4568</td>\n",
       "      <td>IT_PROG</td>\n",
       "      <td>6000.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>103.0</td>\n",
       "      <td>60.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102</th>\n",
       "      <td>202</td>\n",
       "      <td>Pat</td>\n",
       "      <td>Fay</td>\n",
       "      <td>PFAY</td>\n",
       "      <td>603.123.6666</td>\n",
       "      <td>MK_REP</td>\n",
       "      <td>6000.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>201.0</td>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>103</th>\n",
       "      <td>203</td>\n",
       "      <td>Susan</td>\n",
       "      <td>Mavris</td>\n",
       "      <td>SMAVRIS</td>\n",
       "      <td>515.123.7777</td>\n",
       "      <td>HR_REP</td>\n",
       "      <td>6500.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>101.0</td>\n",
       "      <td>40.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>104</th>\n",
       "      <td>204</td>\n",
       "      <td>Hermann</td>\n",
       "      <td>Baer</td>\n",
       "      <td>HBAER</td>\n",
       "      <td>515.123.8888</td>\n",
       "      <td>PR_REP</td>\n",
       "      <td>10000.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>101.0</td>\n",
       "      <td>70.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>105</th>\n",
       "      <td>205</td>\n",
       "      <td>Shelley</td>\n",
       "      <td>Higgins</td>\n",
       "      <td>SHIGGINS</td>\n",
       "      <td>515.123.8080</td>\n",
       "      <td>AC_MGR</td>\n",
       "      <td>12000.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>101.0</td>\n",
       "      <td>110.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>106</th>\n",
       "      <td>206</td>\n",
       "      <td>William</td>\n",
       "      <td>Gietz</td>\n",
       "      <td>WGIETZ</td>\n",
       "      <td>515.123.8181</td>\n",
       "      <td>AC_ACCOUNT</td>\n",
       "      <td>8300.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>205.0</td>\n",
       "      <td>110.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>107 rows × 10 columns</p>\n",
       "</div>"
      ]
     },
     "execution_count": 78,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 78
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": [
    "# 加载数据\n",
    "df_employees = pd.read_csv(\"data/employees.csv\")\n",
    "df_employees\n",
    "# 1. 将first_name首字母大写，其余小写\n",
    "df_employees['first_name'] = df_employees['first_name'].str.capitalize()\n",
    "\n",
    "# 2. 提取邮箱域名（@后部分）\n",
    "df_employees['email_domain'] = df_employees['email'].str.extract(r'@(.+)')\n",
    "\n",
    "print(df_employees[['first_name', 'email', 'email_domain']].head())"
   ],
   "id": "a67cd67066e146ea"
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
